Research question

To what degree is it possible to predict the number of runs a baseball team will score in a game using known statistical data available before the start of the game?

Description

A final model containing all features. The training set is composed of game-by-game WHIP, DER and OPS; the test set uses season-to-date averages of WHIP, DER and OPS.

### load packages
# install.packages("FNN")
# install.packages("rpart")
# install.packages("rpart.plot")
# install.packages("rattle")    
# install.packages("ggplot2")
# install.packages("knitr")
# install.packages("kableExtra")
# install.packages("gridExtra")
# install.packages("tidyverse")
# install.packages("dplyr")
# install.packages("Boruta")
# install.packages("Metrics")

### set working directory
# NOTE(review): a hard-coded, user-specific setwd() makes the script
# non-portable — prefer running from an RStudio project root. Left in place
# to avoid breaking the author's workflow.
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("/Users/geoffcooper/Dropbox/CKME136/ckme136/FinalResults_20200727")
wd <- getwd()

### open tor_games raw data
# file.path() is the idiomatic, platform-safe way to build a path
# (replaces paste(wd, "/...", sep = "")).
datafile <- file.path(wd, "tor_games_20200727.csv")
tor_games <- read.csv(datafile)
head(tor_games)

1. Create datasets

1a. Training dataset - game-by-game WHIP, DER and OPS

# Training set: one row per game, game-by-game ("_game") feature values.
# Mapping of output column name -> source column in tor_games; this replaces
# a 23-way repetition of data.frame(tor_games$...) + names()<- with a single
# subset-and-rename, so adding/removing a player is a one-line change.
training_cols <- c(
  tor_runs         = "tor_runs",
  game_park_factor = "game_park_factor",
  tor_home         = "tor_home",
  opp_pitcher_whip = "opp_pitcher_whip_game",
  opp_team_der     = "opp_team_der_game",
  drurb001_ops     = "drurb001_ops_game",
  bichb001_ops     = "bichb001_ops_game",
  biggc002_ops     = "biggc002_ops_game",
  davij007_ops     = "davij007_ops_game",
  fishd001_ops     = "fishd001_ops_game",
  galvf001_ops     = "galvf001_ops_game",
  gricr001_ops     = "gricr001_ops_game",
  guerv002_ops     = "guerv002_ops_game",
  hernt002_ops     = "hernt002_ops_game",
  gurrl001_ops     = "gurrl001_ops_game",
  jansd001_ops     = "jansd001_ops_game",
  maill001_ops     = "maill001_ops_game",
  mcgur002_ops     = "mcgur002_ops_game",
  mckib001_ops     = "mckib001_ops_game",
  pillk001_ops     = "pillk001_ops_game",
  smoaj001_ops     = "smoaj001_ops_game",
  sogae001_ops     = "sogae001_ops_game",
  tellr001_ops     = "tellr001_ops_game"
)
tor_games_training <- setNames(tor_games[training_cols], names(training_cols))
head(tor_games_training)

1b. Test dataset - season to date average WHIP, DER and OPS, last 150 games

library(dplyr)
# Test set: season-to-date ("_todate") averages of the same features.
# Mapping of output column name -> source column in tor_games (DRY rewrite of
# the original 24-way data.frame(tor_games$...) + names()<- repetition).
test_cols <- c(
  game_date        = "game_date",
  tor_runs         = "tor_runs",
  game_park_factor = "game_park_factor",
  tor_home         = "tor_home",
  opp_pitcher_whip = "opp_pitcher_whip_todate",
  opp_team_der     = "opp_team_der_todate",
  drurb001_ops     = "drurb001_ops_todate",
  bichb001_ops     = "bichb001_ops_todate",
  biggc002_ops     = "biggc002_ops_todate",
  davij007_ops     = "davij007_ops_todate",
  fishd001_ops     = "fishd001_ops_todate",
  galvf001_ops     = "galvf001_ops_todate",
  gricr001_ops     = "gricr001_ops_todate",
  guerv002_ops     = "guerv002_ops_todate",
  hernt002_ops     = "hernt002_ops_todate",
  gurrl001_ops     = "gurrl001_ops_todate",
  jansd001_ops     = "jansd001_ops_todate",
  maill001_ops     = "maill001_ops_todate",
  mcgur002_ops     = "mcgur002_ops_todate",
  mckib001_ops     = "mckib001_ops_todate",
  pillk001_ops     = "pillk001_ops_todate",
  smoaj001_ops     = "smoaj001_ops_todate",
  sogae001_ops     = "sogae001_ops_todate",
  tellr001_ops     = "tellr001_ops_todate"
)
tor_games_test <- setNames(tor_games[test_cols], names(test_cols))
# Keep only games after 2019-04-09 (last 150 games, once the season-to-date
# averages have stabilised), then drop the helper date column in one pipeline
# (replaces the separate subset(select = -c(game_date)) step).
tor_games_test <- tor_games_test %>%
  filter(game_date > 20190409) %>%
  select(-game_date)
head(tor_games_test)

2. Distribution of variables

2a. Dependent variable (y) - Runs/Game by Toronto Blue Jays (2019 Season)

library(ggplot2)
# Distribution of the dependent variable: runs scored per game in 2019,
# with a density overlay and a dashed red line at the mean.
ggplot(tor_games_training, aes(x = tor_runs, y = ..count..)) +
  geom_histogram(binwidth = 1, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  geom_density(alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Toronto Blue Jays runs scored (y) - 2019 season") +
  geom_vline(aes(xintercept = mean(tor_runs)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(tor_runs), label = round(mean(tor_runs), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()

2b. Independent variable (x3) - Opposition pitcher WHIP

# library(), not require(), for a hard dependency: require() only warns and
# returns FALSE when the package is missing, deferring the failure.
library(gridExtra)
# Compare opposition pitcher WHIP: training set (single-game values) vs
# test set (season-to-date averages), side by side.
whip_training <- ggplot(tor_games_training, aes(x = opp_pitcher_whip, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.5, 4.5), breaks = seq(0, 4, 0.5)) +
  scale_y_continuous(limits = c(0, 60), breaks = seq(0, 50, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition pitcher\nWHIP (x3) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(opp_pitcher_whip)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_pitcher_whip), label = round(mean(opp_pitcher_whip), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
whip_test <- ggplot(tor_games_test, aes(x = opp_pitcher_whip, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.5, 4.5), breaks = seq(0, 4, 0.5)) +
  scale_y_continuous(limits = c(0, 60), breaks = seq(0, 50, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition pitcher\nWHIP (x3) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(opp_pitcher_whip)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_pitcher_whip), label = round(mean(opp_pitcher_whip), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(whip_training, whip_test, ncol = 2)

2c. Independent variable (x4) - Opposition team DER

# library(), not require(), for a hard dependency (require() only warns).
library(gridExtra)
# Compare opposition team DER: training (by game, binwidth 0.1) vs test
# (season-to-date averages, finer binwidth 0.02 since values cluster tightly).
der_training <- ggplot(tor_games_training, aes(x = opp_team_der, y = ..count..)) +
  geom_histogram(binwidth = 0.1, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(0.4, 1.1), breaks = seq(0.5, 1, 0.1)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 70, 10)) +
  geom_density(aes(y = 0.1 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition team DER\n(x4) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(opp_team_der)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_team_der), label = round(mean(opp_team_der), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
der_test <- ggplot(tor_games_test, aes(x = opp_team_der, y = ..count..)) +
  geom_histogram(binwidth = 0.02, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(0.4, 1.1), breaks = seq(0.5, 1, 0.1)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 70, 10)) +
  geom_density(aes(y = 0.02 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Opposition team DER\n(x4) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(opp_team_der)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(opp_team_der), label = round(mean(opp_team_der), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(der_training, der_test, ncol = 2)

2d. Independent variable (x7) - Cavan Biggio OPS

Cavan Biggio

# library(), not require(), for a hard dependency (require() only warns).
library(gridExtra)
# Cavan Biggio OPS: training (per game) vs test (season-to-date average).
biggc002_training <- ggplot(tor_games_training, aes(x = biggc002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 90, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Cavan Biggio OPS\n(x7) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(biggc002_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(biggc002_ops), label = round(mean(biggc002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
biggc002_test <- ggplot(tor_games_test, aes(x = biggc002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 90), breaks = seq(0, 90, 10)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Cavan Biggio OPS\n(x7) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(biggc002_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(biggc002_ops), label = round(mean(biggc002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(biggc002_training, biggc002_test, ncol = 2)

2e. Independent variable (x11) - Randal Grichuk OPS

Randal Grichuk

# library(), not require(), for a hard dependency (require() only warns).
library(gridExtra)
# Randal Grichuk OPS: training (per game) vs test (season-to-date average).
# NOTE(review): y limit of 540 with breaks to 160 looks inconsistent with the
# other panels — preserved as-is; confirm against the rendered figure.
gricr001_training <- ggplot(tor_games_training, aes(x = gricr001_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 540), breaks = seq(0, 160, 40)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Randal Grichuk OPS\n(x11) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(gricr001_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(gricr001_ops), label = round(mean(gricr001_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
gricr001_test <- ggplot(tor_games_test, aes(x = gricr001_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 540), breaks = seq(0, 160, 40)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Randal Grichuk OPS\n(x11) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(gricr001_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(gricr001_ops), label = round(mean(gricr001_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(gricr001_training, gricr001_test, ncol = 2)

2f. Independent variable (x12) - Vladimir Guerrero Jr OPS

Vladimir Guerrero Jr

# library(), not require(), for a hard dependency (require() only warns).
library(gridExtra)
# Vladimir Guerrero Jr OPS: training (per game) vs test (season-to-date avg).
guerv002_training <- ggplot(tor_games_training, aes(x = guerv002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 130), breaks = seq(0, 120, 20)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Training set (by game)") +
  geom_vline(aes(xintercept = mean(guerv002_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(guerv002_ops), label = round(mean(guerv002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
guerv002_test <- ggplot(tor_games_test, aes(x = guerv002_ops, y = ..count..)) +
  geom_histogram(binwidth = 0.25, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  scale_x_continuous(limits = c(-0.25, 3), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(limits = c(0, 130), breaks = seq(0, 120, 20)) +
  geom_density(aes(y = 0.25 * ..count..), alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Test set (average to date)") +
  geom_vline(aes(xintercept = mean(guerv002_ops)), color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(guerv002_ops), label = round(mean(guerv002_ops), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
grid.arrange(guerv002_training, guerv002_test, ncol = 2)

3. Train regression models

# Empty accumulator tables for model comparison; one row is appended (rbind)
# per evaluated model in the sections below.
models_compare_training <- data.frame(
  Model = character(),
  Correlation_r = double(),
  Root_mean_squared_error = double(),
  Error_distribution = character()
)
models_compare_test <- data.frame(
  Model = character(),
  Correlation_r = double(),
  Root_mean_squared_error = double(),
  Error_distribution = character()
)

3a. Linear regression model

Train model - Linear regression

# Fit the full linear model: runs scored as a function of park factor,
# home/away indicator, opposition pitching (WHIP), opposition defense (DER),
# and the per-player OPS features.
model_linreg <- lm(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  data = tor_games_training
)
summary(model_linreg)
## 
## Call:
## lm(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + 
##     opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops + 
##     davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops + 
##     guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops + 
##     maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops + 
##     smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9750 -1.0246 -0.0079  1.0842  6.2977 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.04572    2.19414  -0.477 0.634397    
## game_park_factor  1.64124    1.52906   1.073 0.284968    
## tor_home          0.06331    0.30299   0.209 0.834797    
## opp_pitcher_whip  0.61667    0.24050   2.564 0.011406 *  
## opp_team_der     -3.55534    1.90159  -1.870 0.063634 .  
## drurb001_ops      1.05968    0.26672   3.973 0.000114 ***
## bichb001_ops      0.97259    0.33756   2.881 0.004590 ** 
## biggc002_ops      0.96562    0.21894   4.411 2.05e-05 ***
## davij007_ops      1.37884    0.56310   2.449 0.015585 *  
## fishd001_ops      0.65759    0.45278   1.452 0.148661    
## galvf001_ops      0.69456    0.23043   3.014 0.003063 ** 
## gricr001_ops      0.85121    0.22450   3.792 0.000222 ***
## guerv002_ops      1.04550    0.23294   4.488 1.49e-05 ***
## hernt002_ops      0.97801    0.17225   5.678 7.66e-08 ***
## gurrl001_ops      1.08369    0.24157   4.486 1.51e-05 ***
## jansd001_ops      1.03540    0.26243   3.945 0.000126 ***
## maill001_ops      1.43475    0.48841   2.938 0.003874 ** 
## mcgur002_ops      0.29216    0.35494   0.823 0.411848    
## mckib001_ops      1.15146    0.29050   3.964 0.000118 ***
## pillk001_ops      7.22145    3.56851   2.024 0.044922 *  
## smoaj001_ops      0.36627    0.23319   1.571 0.118533    
## sogae001_ops      0.89263    0.26014   3.431 0.000791 ***
## tellr001_ops      0.98589    0.23834   4.136 6.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.721 on 139 degrees of freedom
## Multiple R-squared:  0.7555, Adjusted R-squared:  0.7168 
## F-statistic: 19.53 on 22 and 139 DF,  p-value: < 2.2e-16

Evaluate model - Training data - Linear regression

library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.2
# Evaluate the linear model on the data it was trained on (in-sample,
# optimistic baseline).
linreg_predict_training <- predict(model_linreg, tor_games_training)
linreg_actuals_preds_training <- data.frame(cbind(actuals=tor_games_training$tor_runs, predicteds=linreg_predict_training))
# Spearman rank correlation between actual and predicted runs.
correlation_accuracy_training <- cor(linreg_actuals_preds_training$actuals, y=linreg_actuals_preds_training$predicteds, method=c("spearman"))
# head(linreg_actuals_preds_training)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_training
## [1] 0.8729085
linreg_preds_errors <- (linreg_actuals_preds_training$actuals - linreg_actuals_preds_training$predicteds)
linreg_rmse <- rmse(linreg_actuals_preds_training$actuals, linreg_actuals_preds_training$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 1.594344
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test on residuals: p > 0.05 -> fail to reject normality.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Normal"
models_compare_training <- rbind(models_compare_training, data.frame("Model"="Linear regression","Correlation_r"=correlation_accuracy_training,"Root_mean_squared_error"=linreg_rmse,"Error_distribution"=linreg_preds_errors_dist))
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Training)","Correlation_r"=correlation_accuracy_training,"Root_mean_squared_error"=linreg_rmse,"Error_distribution"=linreg_preds_errors_dist))
# FIX: y-axis label contained mojibake ("Å·"); restored the intended y-hat (ŷ).
plot(linreg_actuals_preds_training$actuals, linreg_actuals_preds_training$predicteds, main="Plot of runs/game - Linear regression model - Training data", xlab="Actual (y)", ylab="Predicted (ŷ)")
# FIX: the fitted line was lm(actuals ~ predicteds), which is inverted for a
# plot with x = actuals, y = predicteds; regress predicteds on actuals instead,
# consistent with the k-NN and regression-tree sections below.
abline(lm(linreg_actuals_preds_training$predicteds~linreg_actuals_preds_training$actuals), col="red")

Histogram of runs/game, Training data, Actual vs Predicted - Linear regression model

library(ggplot2)
# Overlay the actual and predicted run distributions for the linear model.
# Build the two labelled frames directly instead of assigning names() after.
actuals <- data.frame(Runs = tor_games_training$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = linreg_predict_training, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nLinear regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()

3b. k-NN regression model

Train model - k-NN regression

library(dplyr)
library(FNN)
# Split the training frame into response (one-column data frame) and
# predictors using base subsetting.
knn_tor_games_actuals <- tor_games_training["tor_runs"]
knn_tor_games_training <- tor_games_training[setdiff(names(tor_games_training), "tor_runs")]
# k-NN regression with k = 12; test = NULL, so predictions come from the
# training data itself (presumably leave-one-out per FNN::knn.reg — confirm).
model_knn <- knn.reg(knn_tor_games_training, test = NULL, knn_tor_games_actuals, k = 12)
model_knn
## PRESS =  1125.403 
## R2-Predict =  NA

Evaluate model - Training data - k-NN regression

library(Metrics)
# Evaluate the k-NN model on the training data.
# Spearman rank correlation between actual and predicted runs.
knn_cor <- cor(knn_tor_games_actuals$tor_runs, y=model_knn$pred, method=c("spearman"))
# head(model_knn$pred)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
knn_cor
## [1] 0.7536747
knn_preds_errors <- (knn_tor_games_actuals$tor_runs - model_knn$pred)
knn_rmse <- rmse(knn_tor_games_actuals$tor_runs, model_knn$pred)
print("Root mean squared error")
## [1] "Root mean squared error"
knn_rmse
## [1] 2.635703
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test on residuals: p > 0.05 -> fail to reject normality.
sw <- shapiro.test(knn_preds_errors)
knn_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
knn_preds_errors_dist
## [1] "Not normal"
models_compare_training <- rbind(models_compare_training, data.frame("Model"="k-NN regression","Correlation_r"=knn_cor,"Root_mean_squared_error"=knn_rmse,"Error_distribution"=knn_preds_errors_dist))
# FIX: y-axis label contained mojibake ("Å·"); restored the intended y-hat (ŷ).
plot(knn_tor_games_actuals$tor_runs, model_knn$pred, main="Plot of runs/game - k-NN regression model - Training data", xlab="Actual (y)", ylab="Predicted (ŷ)")
abline(lm(model_knn$pred~knn_tor_games_actuals$tor_runs), col="red")

Histogram of runs/game, Training data, Actual vs Predicted - k-NN regression model

library(ggplot2)
# Overlay the actual and predicted run distributions for the k-NN model.
# Build the two labelled frames directly instead of assigning names() after.
actuals <- data.frame(Runs = knn_tor_games_actuals$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = model_knn$pred, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nk-NN regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()

3c. Regression tree model

Train model - Regression tree

library(rpart)
# Fit an ANOVA (regression) tree on the same full feature set as the linear
# model, with default rpart control parameters.
model_regtree <- rpart(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  method = "anova",
  data = tor_games_training
)
summary(model_regtree)
## Call:
## rpart(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + 
##     opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops + 
##     davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops + 
##     guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops + 
##     maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops + 
##     smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training, 
##     method = "anova")
##   n= 162 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.35721276      0 1.0000000 1.0104050 0.15005391
## 2  0.10122683      1 0.6427872 0.7489114 0.11961378
## 3  0.07836614      2 0.5415604 0.6990594 0.11651070
## 4  0.03420656      3 0.4631943 0.5989963 0.08902639
## 5  0.02999152      4 0.4289877 0.6554938 0.09522038
## 6  0.02440076      5 0.3989962 0.6842977 0.09706661
## 7  0.02103845      6 0.3745954 0.6471846 0.09468305
## 8  0.01662439      7 0.3535570 0.6162963 0.08675508
## 9  0.01175462      8 0.3369326 0.6146063 0.08646160
## 10 0.01077065      9 0.3251780 0.6178646 0.08633277
## 11 0.01000000     10 0.3144073 0.6170655 0.08632511
## 
## Variable importance
## opp_pitcher_whip     opp_team_der     biggc002_ops     guerv002_ops 
##               36               22               10                5 
##     smoaj001_ops     gricr001_ops     drurb001_ops     fishd001_ops 
##                4                4                3                3 
##     mcgur002_ops     jansd001_ops     galvf001_ops game_park_factor 
##                3                3                1                1 
##         tor_home     gurrl001_ops     sogae001_ops     tellr001_ops 
##                1                1                1                1 
##     bichb001_ops 
##                1 
## 
## Node number 1: 162 observations,    complexity param=0.3572128
##   mean=4.481481, MSE=10.39781 
##   left son=2 (113 obs) right son=3 (49 obs)
##   Primary splits:
##       opp_pitcher_whip < 1.55   to the left,  improve=0.3572128, (0 missing)
##       opp_team_der     < 0.7245 to the right, improve=0.2791761, (0 missing)
##       guerv002_ops     < 1.05   to the left,  improve=0.2106361, (0 missing)
##       biggc002_ops     < 0.55   to the left,  improve=0.1808705, (0 missing)
##       hernt002_ops     < 0.775  to the left,  improve=0.1527696, (0 missing)
##   Surrogate splits:
##       opp_team_der < 0.587  to the right, agree=0.747, adj=0.163, (0 split)
##       drurb001_ops < 1.325  to the left,  agree=0.728, adj=0.102, (0 split)
##       biggc002_ops < 1.8165 to the left,  agree=0.728, adj=0.102, (0 split)
##       smoaj001_ops < 1.883  to the left,  agree=0.728, adj=0.102, (0 split)
##       jansd001_ops < 1.775  to the left,  agree=0.722, adj=0.082, (0 split)
## 
## Node number 2: 113 observations,    complexity param=0.1012268
##   mean=3.212389, MSE=5.671705 
##   left son=4 (71 obs) right son=5 (42 obs)
##   Primary splits:
##       opp_team_der     < 0.7235 to the right, improve=0.2660482, (0 missing)
##       biggc002_ops     < 1.3335 to the left,  improve=0.2056460, (0 missing)
##       guerv002_ops     < 1.05   to the left,  improve=0.1748234, (0 missing)
##       opp_pitcher_whip < 1.134  to the left,  improve=0.1571339, (0 missing)
##       gricr001_ops     < 0.7915 to the left,  improve=0.1481078, (0 missing)
##   Surrogate splits:
##       biggc002_ops     < 0.775  to the left,  agree=0.735, adj=0.286, (0 split)
##       opp_pitcher_whip < 1.0625 to the left,  agree=0.699, adj=0.190, (0 split)
##       guerv002_ops     < 1.05   to the left,  agree=0.681, adj=0.143, (0 split)
##       mcgur002_ops     < 0.25   to the left,  agree=0.673, adj=0.119, (0 split)
##       gricr001_ops     < 0.55   to the left,  agree=0.664, adj=0.095, (0 split)
## 
## Node number 3: 49 observations,    complexity param=0.07836614
##   mean=7.408163, MSE=9.017076 
##   left son=6 (42 obs) right son=7 (7 obs)
##   Primary splits:
##       opp_team_der     < 0.558  to the right, improve=0.2987606, (0 missing)
##       hernt002_ops     < 0.8165 to the left,  improve=0.2228298, (0 missing)
##       guerv002_ops     < 1.0165 to the left,  improve=0.2214362, (0 missing)
##       gricr001_ops     < 1.256  to the left,  improve=0.1493609, (0 missing)
##       opp_pitcher_whip < 2.9635 to the left,  improve=0.1382294, (0 missing)
## 
## Node number 4: 71 observations,    complexity param=0.02440076
##   mean=2.267606, MSE=3.041063 
##   left son=8 (52 obs) right son=9 (19 obs)
##   Primary splits:
##       gricr001_ops     < 0.7915 to the left,  improve=0.1903603, (0 missing)
##       opp_pitcher_whip < 0.845  to the left,  improve=0.1667133, (0 missing)
##       hernt002_ops     < 1.125  to the left,  improve=0.1464811, (0 missing)
##       jansd001_ops     < 0.3665 to the left,  improve=0.1226668, (0 missing)
##       opp_team_der     < 0.8975 to the right, improve=0.0766997, (0 missing)
##   Surrogate splits:
##       opp_pitcher_whip < 1.21   to the left,  agree=0.845, adj=0.421, (0 split)
##       sogae001_ops     < 1.4585 to the left,  agree=0.803, adj=0.263, (0 split)
##       game_park_factor < 0.9    to the right, agree=0.746, adj=0.053, (0 split)
##       bichb001_ops     < 0.125  to the left,  agree=0.746, adj=0.053, (0 split)
## 
## Node number 5: 42 observations,    complexity param=0.03420656
##   mean=4.809524, MSE=6.058957 
##   left son=10 (35 obs) right son=11 (7 obs)
##   Primary splits:
##       biggc002_ops     < 1.3335 to the left,  improve=0.2264222, (0 missing)
##       game_park_factor < 1.047  to the left,  improve=0.2253941, (0 missing)
##       guerv002_ops     < 1.05   to the left,  improve=0.2077189, (0 missing)
##       opp_team_der     < 0.613  to the right, improve=0.1383982, (0 missing)
##       galvf001_ops     < 1.175  to the left,  improve=0.1197605, (0 missing)
## 
## Node number 6: 42 observations,    complexity param=0.02999152
##   mean=6.738095, MSE=5.431406 
##   left son=12 (7 obs) right son=13 (35 obs)
##   Primary splits:
##       fishd001_ops < 0.125  to the right, improve=0.2214591, (0 missing)
##       guerv002_ops < 1.3    to the left,  improve=0.2214591, (0 missing)
##       mcgur002_ops < 0.45   to the right, improve=0.1964096, (0 missing)
##       bichb001_ops < 0.2    to the right, improve=0.1344972, (0 missing)
##       gricr001_ops < 1.256  to the left,  improve=0.1237658, (0 missing)
##   Surrogate splits:
##       mcgur002_ops < 0.45   to the right, agree=0.952, adj=0.714, (0 split)
##       bichb001_ops < 0.2    to the right, agree=0.857, adj=0.143, (0 split)
##       biggc002_ops < 1.919  to the right, agree=0.857, adj=0.143, (0 split)
## 
## Node number 7: 7 observations
##   mean=11.42857, MSE=11.67347 
## 
## Node number 8: 52 observations
##   mean=1.807692, MSE=2.039941 
## 
## Node number 9: 19 observations
##   mean=3.526316, MSE=3.617729 
## 
## Node number 10: 35 observations,    complexity param=0.01662439
##   mean=4.285714, MSE=4.37551 
##   left son=20 (25 obs) right son=21 (10 obs)
##   Primary splits:
##       guerv002_ops     < 0.825  to the left,  improve=0.18285450, (0 missing)
##       gricr001_ops     < 1.3415 to the left,  improve=0.16791040, (0 missing)
##       game_park_factor < 1.0245 to the left,  improve=0.15236320, (0 missing)
##       sogae001_ops     < 0.883  to the left,  improve=0.14109140, (0 missing)
##       hernt002_ops     < 0.9165 to the left,  improve=0.09444963, (0 missing)
##   Surrogate splits:
##       fishd001_ops     < 0.125  to the left,  agree=0.800, adj=0.3, (0 split)
##       game_park_factor < 1.047  to the left,  agree=0.771, adj=0.2, (0 split)
##       opp_team_der     < 0.568  to the right, agree=0.771, adj=0.2, (0 split)
##       galvf001_ops     < 1.175  to the left,  agree=0.771, adj=0.2, (0 split)
##       gricr001_ops     < 0.125  to the right, agree=0.771, adj=0.2, (0 split)
## 
## Node number 11: 7 observations
##   mean=7.428571, MSE=6.244898 
## 
## Node number 12: 7 observations
##   mean=4.285714, MSE=0.7755102 
## 
## Node number 13: 35 observations,    complexity param=0.02103845
##   mean=7.228571, MSE=4.919184 
##   left son=26 (15 obs) right son=27 (20 obs)
##   Primary splits:
##       guerv002_ops < 0.3    to the left,  improve=0.20583030, (0 missing)
##       gricr001_ops < 0.55   to the left,  improve=0.12809200, (0 missing)
##       hernt002_ops < 1.1    to the left,  improve=0.12391030, (0 missing)
##       biggc002_ops < 0.95   to the left,  improve=0.11745480, (0 missing)
##       smoaj001_ops < 0.225  to the right, improve=0.05837042, (0 missing)
##   Surrogate splits:
##       smoaj001_ops < 0.625  to the right, agree=0.714, adj=0.333, (0 split)
##       tellr001_ops < 0.325  to the right, agree=0.686, adj=0.267, (0 split)
##       opp_team_der < 0.712  to the right, agree=0.657, adj=0.200, (0 split)
##       galvf001_ops < 0.9    to the right, agree=0.657, adj=0.200, (0 split)
##       gricr001_ops < 0.1    to the left,  agree=0.657, adj=0.200, (0 split)
## 
## Node number 20: 25 observations,    complexity param=0.01077065
##   mean=3.72, MSE=3.6416 
##   left son=40 (12 obs) right son=41 (13 obs)
##   Primary splits:
##       game_park_factor < 1.0245 to the left,  improve=0.1992812, (0 missing)
##       guerv002_ops     < 0.2    to the right, improve=0.1549448, (0 missing)
##       tor_home         < 0.5    to the left,  improve=0.1470140, (0 missing)
##       gricr001_ops     < 0.7915 to the left,  improve=0.1230257, (0 missing)
##       biggc002_ops     < 0.45   to the right, improve=0.1080146, (0 missing)
##   Surrogate splits:
##       tor_home     < 0.5    to the left,  agree=0.92, adj=0.833, (0 split)
##       guerv002_ops < 0.45   to the right, agree=0.68, adj=0.333, (0 split)
##       biggc002_ops < 0.325  to the right, agree=0.64, adj=0.250, (0 split)
##       davij007_ops < 0.4585 to the right, agree=0.64, adj=0.250, (0 split)
##       galvf001_ops < 0.125  to the right, agree=0.64, adj=0.250, (0 split)
## 
## Node number 21: 10 observations
##   mean=5.7, MSE=3.41 
## 
## Node number 26: 15 observations
##   mean=6.066667, MSE=1.262222 
## 
## Node number 27: 20 observations,    complexity param=0.01175462
##   mean=8.1, MSE=5.89 
##   left son=54 (9 obs) right son=55 (11 obs)
##   Primary splits:
##       opp_pitcher_whip < 2.155  to the right, improve=0.1680815, (0 missing)
##       hernt002_ops     < 0.9    to the left,  improve=0.1613650, (0 missing)
##       gricr001_ops     < 0.55   to the left,  improve=0.1369553, (0 missing)
##       mckib001_ops     < 0.225  to the left,  improve=0.1189162, (0 missing)
##       jansd001_ops     < 0.575  to the left,  improve=0.0994235, (0 missing)
##   Surrogate splits:
##       gurrl001_ops < 0.8165 to the left,  agree=0.80, adj=0.556, (0 split)
##       galvf001_ops < 0.775  to the left,  agree=0.75, adj=0.444, (0 split)
##       hernt002_ops < 0.343  to the left,  agree=0.70, adj=0.333, (0 split)
##       jansd001_ops < 0.575  to the left,  agree=0.70, adj=0.333, (0 split)
##       mckib001_ops < 0.525  to the right, agree=0.70, adj=0.333, (0 split)
## 
## Node number 40: 12 observations
##   mean=2.833333, MSE=1.805556 
## 
## Node number 41: 13 observations
##   mean=4.538462, MSE=3.940828 
## 
## Node number 54: 9 observations
##   mean=7, MSE=5.333333 
## 
## Node number 55: 11 observations
##   mean=9, MSE=4.545455
library(rpart.plot)
library(rattle)
# Draw the fitted regression tree (rattle's fancyRpartPlot wraps rpart.plot
# with colour-coded nodes).
fancyRpartPlot(model_regtree, main="Regression tree model") 

Evaluate model - Training data - Regression tree

library(Metrics)
# Evaluate the regression tree on the training data.
regtree_preds <- predict(model_regtree,type = "vector")
# head(regtree_preds)
# Spearman rank correlation between actual and predicted runs.
regtree_cor <- cor(tor_games_training$tor_runs, y=regtree_preds, method=c("spearman"))
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
regtree_cor
## [1] 0.813598
regtree_preds_errors <- (tor_games_training$tor_runs - regtree_preds)
regtree_rmse <- rmse(tor_games_training$tor_runs, regtree_preds)
print("Root mean squared error")
## [1] "Root mean squared error"
regtree_rmse
## [1] 1.808078
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test on residuals: p > 0.05 -> fail to reject normality.
sw <- shapiro.test(regtree_preds_errors)
regtree_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
regtree_preds_errors_dist
## [1] "Not normal"
models_compare_training <- rbind(models_compare_training, data.frame("Model"="Regression tree","Correlation_r"=regtree_cor,"Root_mean_squared_error"=regtree_rmse,"Error_distribution"=regtree_preds_errors_dist))
# jitter() spreads the discrete leaf-node predictions so overlapping points
# remain visible.
# FIX: y-axis label contained mojibake ("Å·"); restored the intended y-hat (ŷ).
plot(tor_games_training$tor_runs, jitter(regtree_preds,2), main="Plot of runs/game - Regression tree model - Training data", xlab="Actual (y)", ylab="Predicted (ŷ)")
abline(lm(regtree_preds~tor_games_training$tor_runs), col="red")

Histogram of runs/game, Training data, Actual vs Predicted - Regression tree model

library(ggplot2)
# Stack actual and predicted runs into one long data frame so both
# distributions can be drawn as a single dodged histogram.
actuals <- data.frame(tor_games_training$tor_runs)
actuals$dataset <- 'Actual'
names(actuals) <- c('Runs','Dataset')
predicteds <- data.frame(regtree_preds)
predicteds$dataset <- 'Predicted'
names(predicteds) <- c('Runs','Dataset')
runs <- rbind(actuals, predicteds)
# ..count.. maps bar height to bin counts (superseded by after_stat(count) in
# ggplot2 >= 3.4 — left unchanged to match the rendered output of this report).
ggplot(runs, aes(x=Runs, y=..count.., color=Dataset, fill=Dataset)) + geom_histogram(binwidth=1, alpha=0.5, position="dodge") + geom_density(alpha=0) + scale_fill_manual(values=c("darkblue", "lightblue")) + 
scale_color_manual(values=c("darkblue", "lightblue")) + labs(title="Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nRegression tree model - Actual vs Predicted",x="Runs", y="Games") + theme_classic()

3d. Compare models

library(knitr)
library(kableExtra)
# Render the training-set model comparison (accumulated via rbind above)
# as a styled HTML table.
models_compare_training %>% kable() %>% kable_styling()
Model Correlation_r Root_mean_squared_error Error_distribution
Linear regression 0.8729085 1.594344 Normal
k-NN regression 0.7536747 2.635703 Not normal
Regression tree 0.8135980 1.808078 Not normal

4. Selected model - Linear regression

4a. Evaluate selected model - Test data - Linear regression - baseline

library(Metrics)
# Predict runs for the held-out test set using the baseline linear model
# (model_linreg, fit earlier on the game-by-game training features).
linreg_predict_test <- predict(model_linreg, tor_games_test)
linreg_actuals_preds_test <- data.frame(cbind(actuals=tor_games_test$tor_runs, predicteds=linreg_predict_test))
# NOTE(review): labelled "(r)" below, but method="spearman" is rank
# correlation (rho), not Pearson's r — confirm which is intended.
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals, y=linreg_actuals_preds_test$predicteds, method=c("spearman"))
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## [1] 0.23505
# Per-game prediction errors (actual minus predicted).
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
linreg_rmse = rmse(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 3.1885
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk normality test on the prediction errors.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if(sw$p.value>0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
# Append baseline test-set metrics to the running comparison table.
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Test - baseline)","Correlation_r"=correlation_accuracy_test,"Root_mean_squared_error"=linreg_rmse,"Error_distribution"=linreg_preds_errors_dist))
# Fixed mis-encoded "Å·" (UTF-8 "ŷ" read as latin-1) in the axis label.
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds, main="Plot of runs/game - Linear regression model - Test data - baseline", xlab="Actual (y)", ylab="Predicted (ŷ)")
# BUG FIX: abline() overlays y = a + b*x in the plot's coordinates, where
# x = actuals and y = predicteds, so the regression must be fit as
# predicteds ~ actuals. The original fit actuals ~ predicteds, drawing a
# line with the wrong orientation (cf. the correct y~x form used for the
# training-data plot earlier in the file).
abline(lm(linreg_actuals_preds_test$predicteds~linreg_actuals_preds_test$actuals), col="red")

Histogram of runs/game, Test data, Actual vs Predicted - Linear regression - baseline

# Stack actual and predicted test-set runs into one long data frame so both
# distributions can be drawn as a single dodged histogram.
actuals <- data.frame(tor_games_test$tor_runs)
actuals$dataset <- 'Actual'
names(actuals) <- c('Runs','Dataset')
predicteds <- data.frame(linreg_predict_test)
predicteds$dataset <- 'Predicted'
names(predicteds) <- c('Runs','Dataset')
runs <- rbind(actuals, predicteds)
# ..count.. is the pre-ggplot2-3.4 spelling of after_stat(count); kept as-is
# to match the rendered output of this report.
ggplot(runs, aes(x=Runs, y=..count.., color=Dataset, fill=Dataset)) + geom_histogram(binwidth=1, alpha=0.5, position="dodge") + geom_density(alpha=0) + scale_fill_manual(values=c("darkblue", "lightblue")) + 
scale_color_manual(values=c("darkblue", "lightblue")) + labs(title="Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActual vs Predicted - Test Set - baseline",x="Runs", y="Games") + theme_classic()

4b. Revise model using feature selection

library(Boruta)
# Boruta wraps random-forest importance to classify every predictor as
# confirmed important or unimportant relative to shadow features;
# maxRuns=500 caps the number of iterations.
boruta_output <- Boruta(tor_runs~., data=tor_games_training, doTrace=0, maxRuns=500)
boruta_output
## Boruta performed 312 iterations in 9.817127 secs.
##  12 attributes confirmed important: biggc002_ops, drurb001_ops,
## game_park_factor, gricr001_ops, guerv002_ops and 7 more;
##  10 attributes confirmed unimportant: bichb001_ops, davij007_ops,
## fishd001_ops, galvf001_ops, maill001_ops and 5 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

# Per-attribute Boruta statistics (mean/median importance, decision).
boruta_table <- attStats(boruta_output)
boruta_table
### remove outlier, one game where runs > 13
# NOTE(review): filtering the TEST set on the outcome (tor_runs < 13) uses
# information not available before the game and leaks the target into the
# evaluation — consider restricting outlier removal to the training set only;
# verify intent.
tor_games_training <- tor_games_training %>% filter(tor_runs < 13)
tor_games_test <- tor_games_test %>% filter(tor_runs < 13)

4c. Train model - Linear regression - final

# Final linear model: restrict predictors to WHIP, DER and the player-OPS
# features Boruta confirmed important, fit on the outlier-filtered training set.
model_linreg_final <- lm(tor_runs~opp_pitcher_whip+opp_team_der+biggc002_ops+gricr001_ops+guerv002_ops+hernt002_ops+gurrl001_ops+jansd001_ops, tor_games_training)
summary(model_linreg_final)
## 
## Call:
## lm(formula = tor_runs ~ opp_pitcher_whip + opp_team_der + biggc002_ops + 
##     gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops + 
##     jansd001_ops, data = tor_games_training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3417 -1.2306 -0.4019  1.2832  5.3150 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.9452     1.5713   3.784 0.000222 ***
## opp_pitcher_whip   1.2809     0.2283   5.611 9.28e-08 ***
## opp_team_der      -7.0450     1.8123  -3.887 0.000151 ***
## biggc002_ops       0.6065     0.2167   2.799 0.005789 ** 
## gricr001_ops       0.5769     0.2291   2.518 0.012837 *  
## guerv002_ops       0.6686     0.2367   2.825 0.005368 ** 
## hernt002_ops       0.6735     0.1789   3.764 0.000238 ***
## gurrl001_ops       0.8806     0.2351   3.746 0.000254 ***
## jansd001_ops       0.5427     0.2644   2.052 0.041872 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.887 on 152 degrees of freedom
## Multiple R-squared:  0.6325, Adjusted R-squared:  0.6132 
## F-statistic:  32.7 on 8 and 152 DF,  p-value: < 2.2e-16

4d. Evaluate model - Linear regression - final

library(Metrics)
# Predict runs for the (outlier-filtered) test set using the final model.
linreg_predict_final <- predict(model_linreg_final, tor_games_test)
linreg_actuals_preds_test <- data.frame(cbind(actuals=tor_games_test$tor_runs, predicteds=linreg_predict_final))
# NOTE(review): labelled "(r)" below, but method="spearman" is rank
# correlation (rho), not Pearson's r — confirm which is intended.
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals, y=linreg_actuals_preds_test$predicteds, method=c("spearman"))
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## [1] 0.2550327
# Per-game prediction errors (actual minus predicted).
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
linreg_rmse = rmse(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds)
print("Root mean squared error")
## [1] "Root mean squared error"
linreg_rmse
## [1] 2.943055
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk normality test on the prediction errors.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if(sw$p.value>0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
# Append final-model test-set metrics to the running comparison table.
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Test - final)","Correlation_r"=correlation_accuracy_test,"Root_mean_squared_error"=linreg_rmse,"Error_distribution"=linreg_preds_errors_dist))
# Fixed mis-encoded "Å·" (UTF-8 "ŷ" read as latin-1) in the axis label.
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds, main="Plot of runs/game - Linear regression model - Test data - final", xlab="Actual (y)", ylab="Predicted (ŷ)")
# BUG FIX: with x = actuals and y = predicteds on the plot, the overlay line
# must come from lm(predicteds ~ actuals); the original regressed
# actuals ~ predicteds, drawing a line with the wrong orientation.
abline(lm(linreg_actuals_preds_test$predicteds~linreg_actuals_preds_test$actuals), col="red")

Histogram of runs/game, Test data, Actual vs Predicted - Linear regression - final

# Stack actual and final-model predicted runs into one long data frame so
# both distributions can be drawn as a single dodged histogram.
actuals <- data.frame(tor_games_test$tor_runs)
actuals$dataset <- 'Actual'
names(actuals) <- c('Runs','Dataset')
predicteds <- data.frame(linreg_predict_final)
predicteds$dataset <- 'Predicted'
names(predicteds) <- c('Runs','Dataset')
runs <- rbind(actuals, predicteds)
# ..count.. is the pre-ggplot2-3.4 spelling of after_stat(count); kept as-is
# to match the rendered output of this report.
ggplot(runs, aes(x=Runs, y=..count.., color=Dataset, fill=Dataset)) + geom_histogram(binwidth=1, alpha=0.5, position="dodge") + geom_density(alpha=0) + scale_fill_manual(values=c("darkblue", "lightblue")) + 
scale_color_manual(values=c("darkblue", "lightblue")) + labs(title="Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActual vs Predicted - Test Set - final",x="Runs", y="Games") + theme_classic()

4e. Compare models

library(knitr)
library(kableExtra)
# Render the test-set model comparison (baseline vs final) as a styled table.
models_compare_test %>% kable() %>% kable_styling()
Model Correlation_r Root_mean_squared_error Error_distribution
Linear regression (Training) 0.8729085 1.594344 Normal
Linear regression (Test - baseline) 0.2350500 3.188500 Not normal
Linear regression (Test - final) 0.2550327 2.943055 Not normal